In [1]:
import pandas as pd
from pandas.tseries.offsets import DateOffset
import requests
from bs4 import BeautifulSoup
import json
from collections import Counter
from matplotlib import pyplot as plt
from datetime import datetime
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import re
import nltk
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
In [3]:
# Step 0: Load cleaned data from GitHub
import pandas as pd
# Replace the date with the correct one if needed
url = "https://raw.githubusercontent.com/Neeti3107/Foundation-Project_Group-14/main/data/cleaned/retail_sugar_prices_2025-04-15.csv"
# Load the cleaned file
df_filtered = pd.read_csv(url, parse_dates=['date'])
# Quick preview
df_filtered.head()
Out[3]:
| date | admin1 | admin2 | market | market_id | latitude | longitude | category | commodity | commodity_id | unit | priceflag | pricetype | currency | price | usdprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1994-01-15 | Gujarat | Ahmadabad | Ahmedabad | 923 | 23.03 | 72.62 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.5 | 0.43 |
| 1 | 1994-01-15 | Karnataka | Bangalore Urban | Bengaluru | 926 | 12.96 | 77.58 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.2 | 0.42 |
| 2 | 1994-01-15 | Maharashtra | Mumbai city | Mumbai | 955 | 18.98 | 72.83 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.8 | 0.44 |
| 3 | 1994-01-15 | Orissa | Khordha | Bhubaneshwar | 929 | 20.23 | 85.83 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.5 | 0.43 |
| 4 | 1994-01-15 | Tripura | West Tripura | Agartala | 921 | 23.84 | 91.28 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 16.0 | 0.51 |
In [4]:
import pandas as pd
import requests
import re
from datetime import datetime
def get_latest_cleaned_csv_url(user, repo, path="data/cleaned"):
api_url = f"https://api.github.com/repos/{user}/{repo}/contents/{path}"
response = requests.get(api_url)
files = response.json()
csv_files = []
for file in files:
name = file['name']
if name.startswith("retail_sugar_prices_") and name.endswith(".csv"):
match = re.search(r"(\d{4}-\d{2}-\d{2})", name)
if match:
csv_files.append((match.group(1), name))
if not csv_files:
raise ValueError("❌ No cleaned CSV files found.")
# Get latest date
latest_date, latest_file = sorted(csv_files)[-1]
print(f"📁 Latest file found: {latest_file}")
raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{path}/{latest_file}"
return raw_url
# === USE IT ===
user = "Neeti3107"
repo = "Foundation-Project_Group-14"
latest_csv_url = get_latest_cleaned_csv_url(user, repo)
# Load the file
df_filtered = pd.read_csv(latest_csv_url, parse_dates=['date'])
df_filtered.head()
📁 Latest file found: retail_sugar_prices_2025-04-16.csv
Out[4]:
| date | admin1 | admin2 | market | market_id | latitude | longitude | category | commodity | commodity_id | unit | priceflag | pricetype | currency | price | usdprice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1994-01-15 | Gujarat | Ahmadabad | Ahmedabad | 923 | 23.03 | 72.62 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.5 | 0.43 |
| 1 | 1994-01-15 | Karnataka | Bangalore Urban | Bengaluru | 926 | 12.96 | 77.58 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.2 | 0.42 |
| 2 | 1994-01-15 | Maharashtra | Mumbai city | Mumbai | 955 | 18.98 | 72.83 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.8 | 0.44 |
| 3 | 1994-01-15 | Orissa | Khordha | Bhubaneshwar | 929 | 20.23 | 85.83 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 13.5 | 0.43 |
| 4 | 1994-01-15 | Tripura | West Tripura | Agartala | 921 | 23.84 | 91.28 | miscellaneous food | sugar | 97 | KG | actual | retail | INR | 16.0 | 0.51 |
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# --- EDA and Visualizations ---
# Step 1: Sort and create a copy
df_eda = df_filtered[['date', 'price']].copy()
df_eda = df_eda.sort_values('date')
# Step 2: Add Month and Year columns
df_eda['year'] = df_eda['date'].dt.year
df_eda['month'] = df_eda['date'].dt.to_period('M')
# Step 3: Percentage change
df_eda['pct_change'] = df_eda['price'].pct_change() * 100
# Step 4: Monthly average price
df_monthly = df_eda.groupby('month')['price'].mean().reset_index()
df_monthly['month'] = df_monthly['month'].dt.to_timestamp()
# Step 5: Line plot of monthly average price
plt.figure(figsize=(14, 6))
sns.lineplot(data=df_monthly, x='month', y='price', marker='o')
plt.title("Monthly Average Sugar Price (INR)")
plt.xlabel("Month")
plt.ylabel("Average Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()
# Step 6: Monthly percentage change
df_monthly['pct_change'] = df_monthly['price'].pct_change() * 100
plt.figure(figsize=(14, 6))
sns.barplot(data=df_monthly, x='month', y='pct_change', color='orange')
plt.title("Monthly Percentage Change in Sugar Price")
plt.xlabel("Month")
plt.ylabel("Percentage Change")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
# Step 7: Price histogram
plt.figure(figsize=(10, 6))
sns.histplot(df_eda['price'], bins=30, kde=True)
plt.title("Distribution of Sugar Prices (Retail)")
plt.xlabel("Price (INR)")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()
# Step 8: Boxplot by year
plt.figure(figsize=(14, 6))
sns.boxplot(x='year', y='price', data=df_eda)
plt.title("Year-wise Distribution of Sugar Prices (Retail)")
plt.xlabel("Year")
plt.ylabel("Price (INR)")
plt.grid(True)
plt.tight_layout()
plt.show()
# Step 9: Heatmap of average price by year and month
df_eda['month_num'] = df_eda['date'].dt.month
pivot_table = df_eda.pivot_table(values='price', index='year', columns='month_num', aggfunc='mean')
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='YlGnBu', annot=True, fmt=".1f")
plt.title("Monthly Sugar Prices Heatmap (Retail, INR)")
plt.xlabel("Month")
plt.ylabel("Year")
plt.tight_layout()
plt.show()
In [6]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
# Ensure 'month' is datetime and sorted
df_monthly = df_monthly.sort_values('month')
# Plot using Matplotlib to keep datetime on x-axis
fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(df_monthly['month'], df_monthly['pct_change'], color='orange')
# Formatting
ax.set_title("Monthly Percentage Change in Sugar Price")
ax.set_xlabel("Year")
ax.set_ylabel("Percentage Change")
# Set x-axis ticks every 5 years from Jan 1994
locator = mdates.YearLocator(base=5, month=1, day=1)
formatter = mdates.DateFormatter('%Y')
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
# Set limits from Jan 1994 to last available date
start_date = pd.Timestamp('1994-01-01')
end_date = df_monthly['month'].max()
ax.set_xlim([start_date, end_date])
plt.xticks(rotation=0)
plt.grid(True)
plt.tight_layout()
plt.show()
In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
# Create time series with datetime index
df_decompose = df_monthly[['month', 'price']].copy()
df_decompose.set_index('month', inplace=True)
df_decompose = df_decompose.sort_index()
# Optional: Interpolate or drop missing values
df_decompose = df_decompose.interpolate()
# Specify period manually (12 for monthly data)
decompose_result = seasonal_decompose(df_decompose['price'], model='additive', period=12)
# Plot the full decomposition
decompose_result.plot()
plt.suptitle("Seasonal Decomposition of Sugar Prices (Additive Model)", fontsize=16)
plt.tight_layout()
plt.show()
# Plot trend separately
plt.figure(figsize=(16, 6))
decompose_result.trend.plot(color='orange', title='Trend Component')
plt.grid(True)
plt.show()
# Plot seasonality separately
plt.figure(figsize=(16, 6))
decompose_result.seasonal.plot(color='green', title='Seasonal Component')
plt.grid(True)
plt.show()
📊 Insights from EDA of Retail Sugar Prices (1994–2025)¶
🧭 1. Overall Trends (Long-Term Behavior)¶
- Sugar prices in India have steadily increased over the last 30 years.
- From around ₹8/kg in 1994, prices have climbed to over ₹45/kg by recent years.
- A strong upward trend is clearly visible in the trend decomposition plot, especially during:
- 2009–2011: Noticeable inflationary spike in sugar prices.
- 2020–2022: Gradual increase likely influenced by pandemic-era disruptions.
📈 2. Monthly Average & Seasonality¶
- There is seasonal fluctuation in sugar prices, repeating every year.
- Prices tend to drop in early months (April–May) and rise again toward the year-end.
- This pattern suggests:
- A possible link to agricultural harvest cycles or festive demand variations.
- E.g., Diwali or end-of-year celebrations may create demand surges.
🧮 3. Monthly Percentage Change¶
- Most months show mild percentage changes (<5%), but a few months spike or drop sharply:
- These may correspond to policy changes, import/export controls, or supply shocks.
- Sudden changes often align with known economic events or weather-related impacts.
📦 4. Distribution Insights¶
- The most common sugar price historically was ₹8, which occurred 21 times — mostly in early years.
- Overall, sugar prices follow a right-skewed distribution with a concentration of values between ₹20–₹40/kg.
- This skew indicates gradual but consistent inflation in consumer sugar pricing.
📅 5. Year-wise Variability¶
- The boxplot shows wider price ranges in later years (post-2010), suggesting:
- Increased volatility in the market.
- Possibly driven by global market influences, fuel costs, or climate-driven variability.
🌡️ 6. Heatmap View¶
- The heatmap confirms a repeating seasonal cycle:
- Sugar prices are lower in mid-year months and higher towards the end/start of each year.
- 2009, 2016, and 2020 stand out with unusual spikes, possibly due to external shocks.
In [8]:
# Check Which States Have the Most Complete Sugar Price Data
# Add year-month for aggregation
df_filtered['year_month'] = df_filtered['date'].dt.to_period('M').dt.to_timestamp()
# Count how many months of data are available per state
state_monthly_counts = (
df_filtered.groupby(['admin1', 'year_month'])['price']
.count()
.reset_index(name='count')
)
# Pivot to create heatmap-like matrix: rows=state, columns=month, values=presence (0/1)
state_presence = state_monthly_counts.copy()
state_presence['present'] = 1 # Mark available data
heatmap_df = state_presence.pivot(index='admin1', columns='year_month', values='present').fillna(0)
# Optional: sort states by number of months available
heatmap_df['total_months'] = heatmap_df.sum(axis=1)
heatmap_df = heatmap_df.sort_values('total_months', ascending=False).drop(columns='total_months')
# Plot heatmap
plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_df, cmap='Greens', cbar=False, linewidths=0.1)
plt.title("📊 State-wise Monthly Data Completeness for Retail Sugar Prices")
plt.xlabel("Year-Month")
plt.ylabel("State (admin1)")
plt.tight_layout()
plt.show()
# Print summary table: states with most and least data
summary = (
df_filtered.groupby('admin1')['year_month']
.nunique()
.sort_values(ascending=False)
.reset_index(name='months_of_data')
)
print("Top states by data completeness:\n", summary.head(10))
print("\nBottom states by data completeness:\n", summary.tail(10))
C:\Users\neeti\AppData\Local\Temp\ipykernel_11328\757177853.py:28: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128202 (\N{BAR CHART}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
Top states by data completeness:
admin1 months_of_data
0 Maharashtra 257
1 Tamil Nadu 251
2 Rajasthan 241
3 Orissa 240
4 Himachal Pradesh 231
5 Uttar Pradesh 230
6 Karnataka 223
7 Madhya Pradesh 219
8 Kerala 217
9 Bihar 215
Bottom states by data completeness:
admin1 months_of_data
21 Uttarakhand 112
22 Andhra Pradesh 111
23 Andaman and Nicobar 93
24 Chandigarh 92
25 Nagaland 92
26 Puducherry 79
27 Goa 72
28 Chhattisgarh 34
29 Sikkim 21
30 Manipur 13
In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Create pivot table (rows = date, columns = states)
pivot_prices = (
df_filtered.groupby(['year_month', 'admin1'])['price']
.mean()
.unstack()
)
# Step 2: Interpolate missing values (linear within time)
pivot_prices_interp = pivot_prices.interpolate(method='linear', limit_direction='both')
# Optional: Only keep states with at least 180 months (15 years) of data
valid_states = pivot_prices_interp.columns[pivot_prices_interp.notna().sum() >= 180]
pivot_prices_interp = pivot_prices_interp[valid_states]
# Step 3: Standardize (each state's time series gets mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pivot_prices_interp.fillna(0).T) # Transpose: states as rows
# Step 4: Apply KMeans clustering (you can tune 'n_clusters')
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
# Add cluster labels to state names
cluster_df = pd.DataFrame({
'state': pivot_prices_interp.columns,
'cluster': clusters
}).sort_values('cluster')
print(cluster_df)
# Step 5: Optional PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='Set2', s=100)
for i, state in enumerate(pivot_prices_interp.columns):
plt.text(X_pca[i, 0], X_pca[i, 1], state, fontsize=9, ha='right')
plt.title("📍 State Clustering by Sugar Price Pattern (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\neeti\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1440: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
state cluster 15 Maharashtra 0 28 Uttar Pradesh 0 26 Telangana 0 25 Tamil Nadu 0 23 Rajasthan 0 20 Orissa 0 14 Madhya Pradesh 0 13 Kerala 0 12 Karnataka 0 11 Jharkhand 0 30 West Bengal 0 8 Gujarat 0 6 Delhi 0 2 Assam 0 3 Bihar 0 4 Chandigarh 0 10 Himachal Pradesh 0 0 Andaman and Nicobar 1 16 Manipur 1 24 Sikkim 1 1 Andhra Pradesh 2 22 Punjab 2 9 Haryana 2 29 Uttarakhand 2 7 Goa 2 21 Puducherry 2 5 Chhattisgarh 2 19 Nagaland 3 18 Mizoram 3 17 Meghalaya 3 27 Tripura 3
C:\Users\neeti\AppData\Local\Temp\ipykernel_11328\1823660468.py:49: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
plt.tight_layout()
C:\Users\neeti\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 128205 (\N{ROUND PUSHPIN}) missing from current font.
fig.canvas.print_figure(bytes_io, **kw)
🧠 Cluster Interpretations¶
✅ Cluster 0 – Majority Group (Stable/Aligned Trend)¶
States: Maharashtra, Uttar Pradesh, Tamil Nadu, Rajasthan, Karnataka, Kerala, Madhya Pradesh, Gujarat, etc.
- These states show similar seasonal patterns and consistent data availability.
- Likely follow national sugar price trends.
- Suitable for country-level modeling or selecting a representative sample group.
🌴 Cluster 1 – Sparse/Irregular or Island Territories¶
States: Andaman & Nicobar, Manipur, Sikkim
- Often have sparse data or irregular price patterns.
- May have unique supply chains (e.g., non-agricultural or import-dependent).
- Not ideal for standard trend modeling without adjustment.
⚡ Cluster 2 – Semi-distinct / Volatile Trends¶
States: Andhra Pradesh, Punjab, Haryana, Goa, Puducherry, Chhattisgarh
- Show greater price volatility or regional fluctuations.
- Could be influenced by local governance, infrastructure, or supply-demand issues.
- May need separate forecasting models or volatility handling.
🏔️ Cluster 3 – Northeast Focused Outliers¶
States: Nagaland, Mizoram, Meghalaya, Tripura
- Exhibit distinctive pricing behavior compared to mainland states.
- Influenced by transport costs, geography, or border trade policies.
- Important to treat as a unique regional group in analysis.
In [10]:
pip install plotly geopandas
Requirement already satisfied: plotly in c:\users\neeti\anaconda3\lib\site-packages (5.22.0)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: geopandas in c:\users\neeti\anaconda3\lib\site-packages (1.0.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (8.2.2) Requirement already satisfied: packaging in c:\users\neeti\anaconda3\lib\site-packages (from plotly) (23.2) Requirement already satisfied: numpy>=1.22 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (1.26.4) Requirement already satisfied: pyogrio>=0.7.2 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (0.10.0) Requirement already satisfied: pandas>=1.4.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.2.2) Requirement already satisfied: pyproj>=3.3.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (3.7.1) Requirement already satisfied: shapely>=2.0.0 in c:\users\neeti\anaconda3\lib\site-packages (from geopandas) (2.1.0) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\neeti\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2023.3) Requirement already satisfied: certifi in c:\users\neeti\anaconda3\lib\site-packages (from pyogrio>=0.7.2->geopandas) (2024.7.4) Requirement already satisfied: six>=1.5 in c:\users\neeti\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.16.0)
In [11]:
import pandas as pd
import plotly.express as px
import json
import requests
# Your state-cluster data
state_cluster_df = pd.DataFrame({
'state': [
'Maharashtra', 'Uttar Pradesh', 'Telangana', 'Tamil Nadu', 'Rajasthan',
'Orissa', 'Madhya Pradesh', 'Kerala', 'Karnataka', 'Jharkhand',
'West Bengal', 'Gujarat', 'Delhi', 'Assam', 'Bihar', 'Chandigarh',
'Himachal Pradesh', 'Andaman and Nicobar', 'Manipur', 'Sikkim',
'Andhra Pradesh', 'Punjab', 'Haryana', 'Uttarakhand', 'Goa',
'Puducherry', 'Chhattisgarh', 'Nagaland', 'Mizoram', 'Meghalaya', 'Tripura'
],
'cluster': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]
})
# Load India states GeoJSON
# Source: https://github.com/geohacker/india/blob/master/state/india_telengana.geojson
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json' # For US by default
india_geojson_url = 'https://raw.githubusercontent.com/geohacker/india/master/state/india_telengana.geojson'
india_states = requests.get(india_geojson_url).json()
# Fix names to match if needed
state_cluster_df['state'] = state_cluster_df['state'].str.title()
# Plotly Choropleth
fig = px.choropleth_mapbox(
state_cluster_df,
geojson=india_states,
featureidkey='properties.NAME_1',
locations='state',
color='cluster',
color_continuous_scale='Viridis',
mapbox_style='carto-positron',
center={"lat": 23.5937, "lon": 80.9629},
zoom=3.8,
opacity=0.7,
title="State-wise Clustering Based on Sugar Price Patterns"
)
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()